ftp.cs.arizona.edu

home *** CD-ROM | disk | FTP | other *** search

/ ftp.cs.arizona.edu / ftp.cs.arizona.edu.tar / ftp.cs.arizona.edu / icon / newsgrp / group92c.txt / 000017_icon-group-sender _Mon Oct 12 17:36:04 1992.msg < prev next >

Wrap

Internet Message Format | 1993-01-04 | 24KB

Received: by cheltenham.cs.arizona.edu; Wed, 14 Oct 1992 04:08:19 MST Date: 12 Oct 92 17:36:04 GMT From: cis.ohio-state.edu!pacific.mps.ohio-state.edu!linac!uchinews!ellis!goer@ucbvax.Berkeley.EDU (Richard L. Goerwitz) Organization: University of Chicago Computing Organizations Subject: Re: confusing errors Message-Id: <1992Oct12.173604.3765@midway.uchicago.edu> References: <1992Oct12.140817.25199@midway.uchicago.edu> Sender: icon-group-request@cs.arizona.edu To: icon-group@cs.arizona.edu Status: R Errors-To: icon-group-errors@cs.arizona.edu Not so confusing after all, actually. I just realized that "link," "global," etc. are not "beginners." That is, they don't trigger semicolon insertion after a preceding complete expression. They don't need to. Hence global hi procedure main(); end is a perfectly valid Icon program. If anybody finds the Icon tokenizer interesting (automatic semicolon insertion is really a great idea - why don't all languages do it?), then here's a fun program. Those with some savvy will recognize at once how this sort of thing could be useful for anyone implementing an Icon preprocessor or file compressor. Of course there are bugs. 
It was just written, and I've only tried it on a few files:

#
# next_c holds the character currently under examination.  Every
# do_* procedure below leaves next_c set to the first character it
# did NOT consume, so the main loop in iparse_tokens can dispatch on
# it directly.  The default character source always ends with "\n",
# so @getchar can only fail while next_c is a newline.
#
global next_c

# A token: sym is the grammar symbol name, str the source text.
record TOK(sym, str)

#
# main: reads Icon source on standard input and writes it back out
# with comments and all unnecessary whitespace removed.  A single
# blank is written between two tokens only when both the end of the
# previous token and the start of the next are letters, digits, "_"
# or "." (and neither token is the DOT operator).
#
procedure main()

    local separator, T

    separator := ""
    every T := \iparse_tokens(&input) do {
        if any(&digits ++ &letters ++ "._", (\T.str)[1]) & \T.sym ~== "DOT"
        then writes(separator)
        writes(T.str)
        if any(&digits ++ &letters ++ "_.", (\T.str)[-1]) & \T.sym ~== "DOT"
        then separator := " "
        else separator := ""
    }

end


#
# iparse_tokens: file-or-list x coexpression -> TOK records
#                (stream, getchar)           -> Ts (a generator)
#
#     Where stream is anything that ! turns into a sequence of lines
#     (an open file, a list of strings), getchar is an optional
#     co-expression producing one character per activation, and Ts
#     are TOK records.  When stream is nonnull this is a top-level
#     call: after the last real token it suspends TOK("EOFX") and
#     returns TOK("$").  When stream is null this is the recursive
#     call made by do_newline and it simply fails at end of input.
#
#     Each entry in the tables below is [text, symbol, flags], where
#     flags contains "b" if the token can begin an expression and
#     "e" if it can end one.  A newline between an ender and a
#     beginner is what triggers automatic semicolon insertion.
#
procedure iparse_tokens(stream, getchar)

    local elem, whitespace, primitives, reserveds, last_token
    static be_tbl, reserved_tbl, operators

    initial {

        # Primitive Tokens
        #
        primitives := [
            ["identifier",      "IDENT",     "be"],
            ["integer-literal", "INTLIT",    "be"],
            ["real-literal",    "REALLIT",   "be"],
            ["string-literal",  "STRINGLIT", "be"],
            ["cset-literal",    "CSETLIT",   "be"],
            ["end-of-file",     "EOFX",      "" ]]

        # Reserved Words
        #
        reserveds := [
            ["break",     "BREAK",     "be"],
            ["by",        "BY",        "" ],
            ["case",      "CASE",      "b" ],
            ["create",    "CREATE",    "b" ],
            ["default",   "DEFAULT",   "b" ],
            ["do",        "DO",        "" ],
            ["else",      "ELSE",      "" ],
            ["end",       "END",       "b" ],
            ["every",     "EVERY",     "b" ],
            ["fail",      "FAIL",      "be"],
            ["global",    "GLOBAL",    "" ],
            ["if",        "IF",        "b" ],
            ["initial",   "INITIAL",   "b" ],
            ["invocable", "INVOCABLE", "" ],
            ["link",      "LINK",      "" ],
            ["local",     "LOCAL",     "b" ],
            ["next",      "NEXT",      "be"],
            ["not",       "NOT",       "b" ],
            ["of",        "OF",        "" ],
            ["procedure", "PROCEDURE", "" ],
            ["record",    "RECORD",    "" ],
            ["repeat",    "REPEAT",    "b" ],
            ["return",    "RETURN",    "be"],
            ["static",    "STATIC",    "b" ],
            ["suspend",   "SUSPEND",   "be"],
            ["then",      "THEN",      "" ],
            ["to",        "TO",        "" ],
            ["until",     "UNTIL",     "b" ],
            ["while",     "WHILE",     "b" ]]

        # Operators
        #
        operators := [
            [":=",      "ASSIGN",      "" ],
            ["@",       "AT",          "b" ],
            ["@:=",     "AUGACT",      "" ],
            ["&:=",     "AUGAND",      "" ],
            ["=:=",     "AUGEQ",       "" ],
            ["===:=",   "AUGEQV",      "" ],
            [">=:=",    "AUGGE",       "" ],
            [">:=",     "AUGGT",       "" ],
            ["<=:=",    "AUGLE",       "" ],
            ["<:=",     "AUGLT",       "" ],
            ["~=:=",    "AUGNE",       "" ],
            ["~===:=",  "AUGNEQV",     "" ],
            ["==:=",    "AUGSEQ",      "" ],
            [">>=:=",   "AUGSGE",      "" ],
            [">>:=",    "AUGSGT",      "" ],
            ["<<=:=",   "AUGSLE",      "" ],
            ["<<:=",    "AUGSLT",      "" ],
            ["~==:=",   "AUGSNE",      "" ],
            ["\\",      "BACKSLASH",   "b" ],
            ["!",       "BANG",        "b" ],
            ["|",       "BAR",         "b" ],
            ["^",       "CARET",       "b" ],
            ["^:=",     "CARETASGN",   "b" ],
            [":",       "COLON",       "" ],
            [",",       "COMMA",       "" ],
            ["||",      "CONCAT",      "b" ],
            ["||:=",    "CONCATASGN",  "" ],
            ["&",       "CONJUNC",     "b" ],
            [".",       "DOT",         "b" ],
            ["--",      "DIFF",        "b" ],
            ["--:=",    "DIFFASGN",    "" ],
            ["===",     "EQUIV",       "b" ],
            ["**",      "INTER",       "b" ],
            ["**:=",    "INTERASGN",   "" ],
            ["{",       "LBRACE",      "b" ],
            ["[",       "LBRACK",      "b" ],
            ["|||",     "LCONCAT",     "b" ],
            ["|||:=",   "LCONCATASGN", "" ],
            ["==",      "LEXEQ",       "b" ],
            [">>=",     "LEXGE",       "" ],
            [">>",      "LEXGT",       "" ],
            ["<<=",     "LEXLE",       "" ],
            ["<<",      "LEXLT",       "" ],
            ["~==",     "LEXNE",       "b" ],
            ["(",       "LPAREN",      "b" ],
            ["-:",      "MCOLON",      "" ],
            ["-",       "MINUS",       "b" ],
            ["-:=",     "MINUSASGN",   "" ],
            ["%",       "MOD",         "" ],
            ["%:=",     "MODASGN",     "" ],
            ["~===",    "NOTEQUIV",    "b" ],
            ["=",       "NUMEQ",       "b" ],
            [">=",      "NUMGE",       "" ],
            [">",       "NUMGT",       "" ],
            ["<=",      "NUMLE",       "" ],
            ["<",       "NUMLT",       "" ],
            ["~=",      "NUMNE",       "b" ],
            ["+:",      "PCOLON",      "" ],
            ["+",       "PLUS",        "b" ],
            ["+:=",     "PLUSASGN",    "" ],
            ["?",       "QMARK",       "b" ],
            ["<-",      "REVASSIGN",   "" ],
            ["<->",     "REVSWAP",     "" ],
            ["}",       "RBRACE",      "e" ],
            ["]",       "RBRACK",      "e" ],
            [")",       "RPAREN",      "e" ],
            [";",       "SEMICOL",     "" ],
            ["?:=",     "SCANASGN",    "" ],
            ["/",       "SLASH",       "b" ],
            ["/:=",     "SLASHASGN",   "" ],
            ["*",       "STAR",        "b" ],
            ["*:=",     "STARASGN",    "" ],
            [":=:",     "SWAP",        "" ],
            ["~",       "TILDE",       "b" ],
            ["++",      "UNION",       "b" ],
            ["++:=",    "UNIONASGN",   "" ],
            ["$(",      "LBRACE",      "b" ],
            ["$)",      "RBRACE",      "e" ],
            ["$<",      "LBRACK",      "b" ],
            ["$>",      "RBRACK",      "e" ]]

        # reserved_tbl maps reserved-word text -> symbol name.
        reserved_tbl := table()
        every elem := !reserveds do
            insert(reserved_tbl, elem[1], elem[2])

        # be_tbl maps symbol name -> beginner/ender flags.
        be_tbl := table()
        every elem := !primitives | !reserveds | !operators do {
            insert(be_tbl, elem[2], elem[3])
        }
    }

    # Default character source: every character of every line, with
    # a newline appended to each line.
    /getchar   := create !(!stream || "\n")
    whitespace := ' \t'
    /next_c    := @getchar

    #
    # Every branch below writes "suspend last_token := do_xxx(...)"
    # rather than assigning and suspending in two steps: if the
    # do_xxx procedure fails, nothing is suspended, instead of the
    # previous (stale) token being emitted a second time.
    #
    repeat {
        case next_c of {

            "." : {
                # Could be a real literal *or* a dot operator.  Check
                # following character to see if it's a digit.  If so,
                # it's a real literal.  We can only get away with
                # doing the dot here because it is not a substring of
                # any longer operator.  If this gets changed, we'll
                # have to move this code into do_operator().
                #
                suspend last_token := do_dot(getchar)
                next
            }

            "\n" : {
                # If do_newline sets next_c to null, it means we're
                # at the end of the input stream, and we should break
                # out of the repeat loop.
                #
                every last_token := do_newline(getchar, last_token, be_tbl)
                do suspend last_token
                if next_c === &null then break
                next
            }

            "\#" : {
                # Just a comment.  Strip it by reading every character
                # up to the next newline.  The global var next_c
                # should *always* == "\n" when this is done.
                #
                do_number_sign(getchar)
                next
            }

            "\"" : {
                # Suspend as STRINGLIT everything from here up to the
                # next non-backslashed quotation mark, inclusive
                # (accounting for the _ line-continuation convention).
                #
                suspend last_token := do_quotation_mark(getchar)
                next
            }

            "'" : {
                # Suspend as CSETLIT everything from here up to the
                # next non-backslashed apostrophe, inclusive.
                #
                suspend last_token := do_apostrophe(getchar)
                next
            }

            default : {
                # If we get to here, we have either whitespace, an
                # integer or real literal, an identifier or reserved
                # word (both get handled by do_identifier), or an
                # operator.  The question of which we have can be
                # determined largely just by checking the first
                # character.  Whitespace begins with whitespace;
                # integer or real literals with digits, identifiers
                # and reserved words with underscores or letters, and
                # operators begin with everything not covered above.
                #
                if any(whitespace, next_c) then {
                    # Like all of the TOK forming procedures,
                    # do_whitespace resets next_c.  No token results.
                    do_whitespace(getchar, whitespace)
                    next
                }
                if any(&digits, next_c) then {
                    suspend last_token := do_digits(getchar)
                    next
                }
                if any(&letters ++ '_', next_c) then {
                    suspend last_token := do_identifier(getchar, reserved_tbl)
                    next
                }
                # do_operator fails on text that is not an operator;
                # it guarantees that next_c has moved on when it does.
                suspend last_token := do_operator(getchar, operators)
                next
            }
        }
    }

    # If stream argument is nonnull, then we are in the top-level
    # iparse_tokens().  If not, then we are in a recursive call, and
    # we should not emit the end-of-file tokens.
    #
    if \stream then {
        suspend TOK("EOFX")
        return TOK("$")
    }
    else fail

end


#
# do_exponent: coexpression x string -> string
#              (getchar, token)      -> token
#
#     Helper shared by do_dot and do_digits.  On entry next_c must be
#     "e" or "E".  Appends the exponent marker, an optional sign, and
#     all following digits to token; leaves next_c on the first
#     character that is not part of the exponent.
#
procedure do_exponent(getchar, token)

    # global next_c

    token ||:= next_c
    next_c := @getchar
    if token ||:= (next_c == ("+" | "-")) then
        next_c := @getchar
    while any(&digits, next_c) do {
        token ||:= next_c
        next_c := @getchar
    }
    return token

end


#
# do_dot: coexpression -> TOK record
#         getchar      -> t
#
#     Where getchar is the coexpression that produces the next
#     character from the input stream and t is a token record whose
#     sym field contains either "REALLIT" or "DOT".  Essentially,
#     do_dot checks the next char on the input stream to see if it's
#     a digit.  Since the preceding char was a dot, a digit tips us
#     off that we have a real literal.  Otherwise, it's just a dot
#     operator.  Note that do_dot resets next_c for the next cycle
#     through the main case loop in the calling procedure.
#
procedure do_dot(getchar)

    local token
    # global next_c

    # If dot's followed by a digit, then we have a real literal.
    #
    if any(&digits, next_c := @getchar) then {
        token := "." || next_c
        while any(&digits, next_c := @getchar) do
            token ||:= next_c
        # The exponent digits are copied verbatim.  (The earlier code
        # compared with "=" where it meant to assign with ":=", so
        # next_c never advanced.)
        if next_c == ("e" | "E") then
            token := do_exponent(getchar, token)
        return TOK("REALLIT", token)
    }

    # Dot not followed by a digit; so we just have a dot operator,
    # and not a real literal.
    #
    return TOK("DOT", ".")

end


#
# do_newline: coexpression x TOK record x table -> TOK records
#             (getchar, last_token, be_tbl)     -> Ts (a generator)
#
#     Where getchar is the coexpression that returns the next
#     character from the input stream, last_token is the last TOK
#     record suspended by the calling procedure, be_tbl is a table of
#     tokens and their "beginner/ender" status, and Ts are TOK
#     records.  Note that do_newline resets next_c (to &null when the
#     input is exhausted).  What it does is check the last token
#     suspended by the calling procedure to see if it was an ender.
#     If so, it gets the next token by calling iparse_tokens again.
#     If that next token is a beginner, a SEMICOL token is suspended
#     first; the next token is then suspended in either case.  When
#     the last token was not an ender, nothing is produced and the
#     caller simply carries on from next_c.
#
procedure do_newline(getchar, last_token, be_tbl)

    local next_token
    # global next_c

    # Go past any additional newlines.
    #
    while next_c == "\n" do {
        # NL can be the last char in the getchar stream; if it *is*,
        # then signal that it's time to break out of the repeat loop
        # in the calling procedure.
        #
        next_c := @getchar | {
            next_c := &null
            fail
        }
    }

    # If there was a last token (i.e. if a newline wasn't the first
    # character of significance in the input stream), then check to
    # see if it was an ender.  If so, then check to see if the next
    # token is a beginner.  If so, then suspend a TOK("SEMICOL",";")
    # record before suspending the next token.
    #
    if find("e", be_tbl[(\last_token).sym]) then {
        # A null first argument marks this as a recursive call, which
        # suppresses the end-of-file tokens.
        if next_token := iparse_tokens(&null, getchar) then {
            if find("b", be_tbl[next_token.sym])
            then suspend TOK("SEMICOL", ";")
            suspend next_token
        }
        else fail
    }

end


#
# do_number_sign: coexpression -> &null
#                 getchar      ->
#
#     Where getchar is the coexpression that pops characters off the
#     main input stream.  Sets the global variable next_c.  This
#     procedure simply reads characters until it gets a newline, then
#     returns with next_c == "\n".  Since the preceding character was
#     a number sign, this has the effect of stripping comments.
#
procedure do_number_sign(getchar)

    # global next_c

    while next_c ~== "\n" do {
        next_c := @getchar | fail
    }

    # Return to calling procedure to cycle around again with the new
    # next_c already set.  Next_c should always be "\n" at this point.
    return

end


#
# do_quotation_mark: coexpression -> TOK record
#                    getchar      -> t
#
#     Where getchar is the coexpression that yields another character
#     from the input stream, and t is a TOK record with "STRINGLIT"
#     as its sym field.  Puts everything up to and including the next
#     non-backslashed quotation mark into the str field.  Handles the
#     underscore continuation convention.  Fails if the input runs
#     out before the closing quotation mark.
#
#     NOTE(review): Icon also discards leading blanks and tabs on the
#     line that follows an underscore continuation; that is not done
#     here -- confirm whether callers need it.
#
procedure do_quotation_mark(getchar)

    local token
    # global next_c

    token := "\""
    while next_c := @getchar do {
        if next_c == "\n" & token[-1] == "_" then {
            # Drop the underscore and the newline; keep reading.
            token := token[1:-1]
            next
        }
        else {
            # Position 2 skips the opening quotation mark.
            if slashupto("\"", token ||:= next_c, 2) then {
                next_c := @getchar
                # resume outermost (repeat) loop in calling procedure,
                # with the new (here explicitly set) next_c
                return TOK("STRINGLIT", token)
            }
        }
    }

end


#
# do_apostrophe: coexpression -> TOK record
#                getchar      -> t
#
#     Where getchar is the coexpression that yields another character
#     from the input stream, and t is a TOK record with "CSETLIT"
#     as its sym field.  Puts everything up to and including the next
#     non-backslashed apostrophe into the str field.  Fails if the
#     input runs out before the closing apostrophe.
#
procedure do_apostrophe(getchar)

    local token
    # global next_c

    token := "'"
    while next_c := @getchar do {
        if slashupto("'", token ||:= next_c, 2) then {
            next_c := @getchar
            # Return & resume outermost containing loop in calling
            # procedure w/ new next_c.
            return TOK("CSETLIT", token)
        }
    }

end


#
# do_digits: coexpression -> TOK record
#            getchar      -> t
#
#     Where getchar is the coexpression that produces the next char
#     on the input stream, and where t is a TOK record containing
#     either "REALLIT" or "INTLIT" in its sym field, and the text of
#     the numeric literal in its str field.  On entry next_c is the
#     first digit.  Redundant leading zeros are dropped, but the
#     integer part always keeps at least one digit, so str is always
#     a non-empty string ("0" for zero).
#
procedure do_digits(getchar)

    local token, tok_record
    # global next_c

    # Assume integer literal until proven otherwise....
    tok_record := TOK("INTLIT")

    # Strip zeros only while they are *leading*.  (Stripping after the
    # first digit had been accepted turned 100 into 1.)
    token := ""
    while next_c == "0" do
        next_c := @getchar
    while any(&digits, next_c) do {
        token ||:= next_c
        next_c := @getchar
    }
    # Nothing but zeros: keep one, so that what follows ("0.5",
    # "0e5") still reads as a number.
    if *token = 0 then token := "0"

    if token ||:= (next_c == ("R" | "r")) then {
        # Radix literal: the digits may be letters as well.
        while any(&digits ++ &letters, next_c := @getchar) do
            token ||:= next_c
    }
    else {
        if token ||:= (next_c == ".") then {
            while any(&digits, next_c := @getchar) do
                token ||:= next_c
            tok_record := TOK("REALLIT")
        }
        if next_c == ("e" | "E") then {
            token := do_exponent(getchar, token)
            tok_record := TOK("REALLIT")
        }
    }

    tok_record.str := token
    return tok_record

end


#
# do_whitespace: coexpression x cset  -> &null
#                getchar x whitespace -> &null
#
#     Where getchar is the coexpression producing the next char on
#     the input stream.  Do_whitespace just repeats until it finds a
#     non-whitespace character, whitespace being defined as
#     membership of a given character in the whitespace argument (a
#     cset).
#
procedure do_whitespace(getchar, whitespace)

    # global next_c

    while any(whitespace, next_c) do
        next_c := @getchar
    return

end


#
# do_identifier: coexpression x table    -> TOK record
#                (getchar, reserved_tbl) -> t
#
#     Where getchar is the coexpression that pops off characters from
#     the input stream, reserved_tbl is a table of reserved words
#     (keys = the string values, values = the names qua symbols in
#     the grammar), and t is a TOK record containing all subsequent
#     letters, digits, or underscores after next_c (which must be a
#     letter or underscore).  Note that next_c is global and gets
#     reset by do_identifier.
#
procedure do_identifier(getchar, reserved_tbl)

    local token
    # global next_c

    token := next_c
    while any(&letters ++ &digits ++ '_', next_c := @getchar)
    do token ||:= next_c
    # Reserved words get their own symbol; anything else is an IDENT.
    return TOK(\reserved_tbl[token], token) | TOK("IDENT", token)

end


#
# do_operator: coexpression x list -> TOK record
#              getchar x operators -> t
#
#     Where getchar is the coexpression that produces the next
#     character on the input stream, and t is a TOK record
#     describing the operator just scanned.  Calls recognop, which
#     creates a DFSA to recognize valid Icon operators.  Arg2
#     (operators) is the list of valid Icon operators formed by the
#     calling procedure.  Fails if the text at next_c is not an
#     operator; in that case next_c has always advanced by at least
#     one character, so the caller cannot spin on the same input.
#
procedure do_operator(getchar, operators)

    local token, elem
    # global next_c

    token := next_c

    # Go until recognop fails.  elem keeps the completions found for
    # the longest prefix that recognop accepted.
    while elem := recognop(operators, token, 1) do
        token ||:= (next_c := @getchar)

    if *\elem = 1 then
        return TOK(elem[1][2], elem[1][1])

    # Not an operator.  If even the first character was rejected,
    # nothing has been consumed yet; skip it before failing.
    if /elem then
        next_c := @getchar
    fail

end


# One state of the operator automaton: b and e are the first and last
# indices in the sorted operator list of the entries that share the
# prefix leading to this state; tbl maps a next character to the
# state it leads to (null until built, 0 if there are no arcs).
record dfstn_state(b, e, tbl)
record start_state(b, e, tbl, master_list)

#
# recognop: list x string x integer -> list
#           (l, s, i)               -> completions
#
#     Where l is a list of records/lists, i is the field of each
#     entry holding the string to match (default 1), and s is the
#     candidate string.  Fails if s is not a prefix of any entry.
#     Otherwise returns the (possibly empty) list of entries whose
#     field i has exactly the length of s.  The automaton for l is
#     built lazily and cached.
#
procedure recognop(l, s, i)

    local current_state, master_list, c, result, j
    static dfstn_table
    initial dfstn_table := table()

    /i := 1

    # See if we've created an automaton for l already.
    /dfstn_table[l] := start_state(1, *l, &null, &null) & {
        dfstn_table[l].master_list := sortf(l, i)
    }

    current_state := dfstn_table[l]
    # Save master_list, as current_state will change later on.
    master_list := current_state.master_list

    s ? {
        while c := move(1) do {

            # Null means that this part of the automaton isn't
            # complete.
            #
            if /current_state.tbl then
                create_arcs(master_list, i, current_state, &pos)

            # If the table has been clobbered, then there are no arcs
            # leading out of the current state.  Fail.
            #
            if current_state.tbl === 0 then
                fail

            # If we get to here, the current state has arcs leading
            # out of it.  See if c is one of them.  If so, make the
            # node to which arc c is connected the current state.
            # Otherwise fail.
            #
            current_state := \current_state.tbl[c] | fail
        }
    }

    # Return possible completions.
    #
    result := list()
    every j := current_state.b to current_state.e do {
        if *master_list[j][i] = *s then
            put(result, master_list[j])
    }
    # *result = 0 if nothing the right length is found
    return result

end


#
# create_arcs: fills in current_state.tbl with one dfstn_state per
# distinct character found at position POS-1 of field `field' among
# master_list[current_state.b .. current_state.e].  Relies on
# master_list being sorted on that field, so that entries sharing a
# character are adjacent.  Sets tbl to 0 if no arcs result.
#
procedure create_arcs(master_list, field, current_state, POS)

    local elem, i, first_char, old_first_char

    current_state.tbl := table()
    old_first_char := ""

    every elem := master_list[i := current_state.b to current_state.e][field]
    do {

        # Get the first character for the current position (note that
        # we're one character behind the calling routine; hence
        # POS-1).  Entries too short to have one are skipped.
        #
        first_char := elem[POS-1] | next

        # If we have a new first character, create a new arc out of
        # the current state.
        #
        if first_char ~== old_first_char then {
            # Store the start position for the current character.
            current_state.tbl[first_char] := dfstn_state(i)
            # Store the end position for the old character.
            (\current_state.tbl[old_first_char]).e := i-1
            old_first_char := first_char
        }
    }
    # Close off the last arc.
    (\current_state.tbl[old_first_char]).e := i

    # Clobber table with 0 if no arcs were added.  (The comparison
    # yields 0 when it succeeds and leaves tbl alone when it fails.)
    current_state.tbl := (*current_state.tbl = 0)
    return current_state

end


#
# slashupto:  cset x string x integer x integer -> integers
#             (c, s, i, j)                      -> Is (a generator)
#
#     where Is are the integer positions in s[i:j] before characters
#     in c that are not preceded by a backslash escape
#
procedure slashupto(c, s, i, j)

    if /s := &subject
    then /i := &pos
    else /i := 1
    /j := *s + 1

    /c := &cset
    # Backslash is added so that the scan stops at every escape too.
    c ++:= '\\'
    s[1:j] ? {
        tab(i)
        while tab(upto(c)) do {
            if ="\\" then {
                # Skip the escaped character.
                move(1)
                next
            }
            suspend .&pos
            move(1)
        }
    }

end

-- -Richard L. Goerwitz goer%midway@uchicago.bitnet goer@midway.uchicago.edu rutgers!oddjob!ellis!goer